data(us.cities)
# Get major cities for each sample region (state).
# Some states' most-populous cities cluster in one metro area, so a
# per-state exclusion list drops suburbs / overlapping labels rather
# than repeating near-identical if/else branches.
.states <- c("OR", "VT", "CO", "NC")
.city.exclusions <- list(
  OR = c("Gresham", "Hillsboro", "Corvallis", "Beaverton", "Springfield"),
  CO = c("Thornton", "Lakewood", "Aurora"),
  NC = c("Greensboro", "Durham", "Fayetteville")
)
top.cities <- purrr::map_df(.states, function(s) {
  # Top cities by population, with the trailing state suffix stripped
  # from `name` (e.g. "Portland OR" -> "Portland")
  out <- us.cities %>%
    filter(country.etc == s) %>%
    mutate(city = gsub(paste0(" ", s), "", name)) %>%
    arrange(-pop) %>%
    head()  # head() default: top 6 rows, as in the original branches
  # Drop excluded cities, if any are defined for this state
  excl <- .city.exclusions[[s]]
  if (!is.null(excl)) {
    out <- out %>% filter(!(city %in% excl))
  }
  out
})
# Basemap polygons for the four focal states
states <- map_data("state") %>%
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))
# Read every final observation file and stack into one data frame
data.files <- list.files("data/final", full.names = T)
df <- data.files %>% purrr::map_df(readRDS)
# Capitalize any lowercase letter that immediately follows whitespace
# (the first word is left untouched). perl = T enables the lookbehind
# and the \U case-conversion escape in the replacement.
caps.after.ws <- function(string) {
  gsub(pattern = "(?<=\\s)([a-z])", replacement = "\\U\\1",
       x = string, perl = T)
}
# Define a function to create a plot for each species:
# draws the state polygon, the species' observation points, and the
# labeled top cities; returns a one-row data.table holding the plot.
# spec:    species common name (underscores allowed; shown with spaces)
# st.abbr: two-letter state abbreviation ("CO", "NC", "VT", "OR")
plot.for.species <- function(spec, st.abbr) {
  # Map the abbreviation to the map_data("state") region name.
  # TRUE (not the reassignable alias T) for the fallback branch.
  st <- case_when(st.abbr == "CO" ~ "colorado",
                  st.abbr == "NC" ~ "north carolina",
                  st.abbr == "VT" ~ "vermont",
                  st.abbr == "OR" ~ "oregon",
                  TRUE ~ "")
  # e.g. "CO Ruddy Duck Observations, 2016-2019"
  title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec),
                               "Observations, 2016-2019"))
  p <- ggplot(data = states %>% filter(region == st)) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "#989875", color = "black") +
    geom_point(data = df %>% filter(state == st.abbr & common.name == spec),
               aes(x = lon, y = lat),
               size=1, alpha=.5, fill = "red", shape=21) +
    geom_point(data = top.cities %>% filter(country.etc == st.abbr),
               aes(x=long, y=lat),
               fill="gold", color="black", size=3.5, shape = 21) +
    geom_text(data = top.cities %>% filter(country.etc == st.abbr),
              aes(x=long, y=lat, label=city),
              color="white",
              # Per-state nudges keep labels legible on narrow states
              hjust=case_when(st.abbr=="NC"~.2,
                              st.abbr=="VT"~.65,
                              TRUE~.5),
              vjust=ifelse(st.abbr=="NC", -.65, 1.5),
              size=4) +
    coord_map() +
    ggtitle(title) +
    theme_minimal() +
    theme(panel.background = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          panel.grid = element_blank())
  # One row per (state, species); the ggplot lives in a list column
  data.table(
    state=st.abbr,
    species=spec,
    plot=list(p)
  )
}
# Every (species, state) combination present in the observation data.
# stringsAsFactors = F keeps plain character columns, consistent with
# the other expand.grid() calls in this file.
spec.state <- expand.grid(unique(df$common.name), unique(df$state),
                          stringsAsFactors = F) %>%
  rename(spec=Var1, st.abbr=Var2)
# Create a list of plots (one data.table row per species/state pair)
plots <- purrr::map2_df(spec.state$spec,
                        spec.state$st.abbr,
                        ~plot.for.species(.x, .y))
# Arrange the four state plots (2x2) for each species of interest.
# A loop replaces eight copy-pasted do.call() blocks; print() is needed
# so each arranged figure renders from inside the loop.
for (sp in c("Ruddy Duck", "Belted Kingfisher", "Wild Turkey",
             "Sharp-shinned Hawk", "Downy Woodpecker", "Cedar Waxwing",
             "Sandhill Crane", "Sanderling")) {
  print(do.call(ggpubr::ggarrange,
                c(plots[species == sp]$plot,
                  list(nrow=2, ncol=2))))
}
# Read each state's raster stack into a named list
states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast) %>% setNames(states)
TODO: explore additional raster summaries — `terra::freq`, `terra::density`, and `terra::layerCor`.
# Stack raster cell values across states into one long data.table,
# keeping only complete (no-NA) rows.
r.df <- map_df(states, function(s) {
  df <- r.list[[s]] %>% as.data.frame()
  # Strip the state suffix (e.g. "_CO") from layer names
  names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
  df %>% setDT()
  df[, state := factor(s, levels=states)]
  # complete.cases() replaces apply(df, 1, ...), which coerced the
  # whole table to a character matrix because of the factor column
  df[complete.cases(df)]
})
# Custom function to process factor levels: normalize raster category
# labels to snake_case, e.g. "Developed, Open Space" -> "developed_open_space".
clean.levels <- function(x) {
  # Strip the NLCD layer prefix FIRST: the gsub is case-sensitive, so
  # running it after tolower() (as before) could never match anything
  x <- gsub("NLCD_Land_", "", x)
  # Replace every non-alphanumeric character with an underscore
  x <- gsub("[^a-zA-Z0-9]", "_", x)
  # Convert to lowercase
  x <- tolower(x)
  # Collapse runs of underscores, then trim leading/trailing ones
  x <- gsub("_+", "_", x)
  x <- gsub("^_|_$", "", x)
  return(x)
}
# Relabel the NLCD factor: clean.levels() rewrites the level labels and
# indexing that label vector by the factor (used as an integer index)
# maps each row to its cleaned label before re-factoring.
r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]
# Convert factor columns to dummy variables:
# model.matrix(~ . - 1) one-hot encodes NLCD_Land and state without an
# intercept; the remaining (numeric) columns are bound back on.
df.dummies <- data.table(model.matrix(~ . - 1, data = r.df[, .(NLCD_Land, state)])) %>%
  cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F])
# Drop the "NLCD_Land" prefix model.matrix prepends to the level names
names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))
# Ensure that there is more than one value per column (remove otherwise):
# constant columns carry no information and break PCA scaling.
# vapply over columns replaces the transpose/filter/rownames detour.
uniq.1 <- names(df.dummies)[vapply(df.dummies, uniqueN, integer(1)) == 1]
if (length(uniq.1) > 0) {
  df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}
# Fit a PCA on the dummy-encoded raster data and inspect the loadings
pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")
# Variable coordinates (loadings) as a tibble with a `var` column
res <- pca.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
# Helper: horizontal bar chart of variable loadings for one dimension
# (removes the duplicated p.d1 / p.d2 plotting code)
var.imp.plot <- function(res, dim, fill) {
  ggplot(res, aes(x = reorder(var, .data[[dim]]), y = .data[[dim]])) +
    geom_bar(stat = "identity", fill=fill) +
    coord_flip() + # Makes it a horizontal bar chart
    labs(title = paste("Variable importance for", dim),
         y = "Importance", x = "Variable") +
    theme_minimal()
}
p.d1 <- var.imp.plot(res, "Dim.1", "darkblue")
p.d2 <- var.imp.plot(res, "Dim.2", "darkred")
ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)
# Fit a FAMD (handles mixed quantitative/qualitative data) on the raw
# raster data and inspect the variable maps
famd.fit <- FAMD(r.df, graph=F)
ggpubr::ggarrange(plotlist=purrr::map(
  c("var", "quanti", "quali"),
  ~plot.FAMD(famd.fit, choix=.x)),
  ncol=1, nrow=3)
# Variable coordinates (loadings) as a tibble with a `var` column
res <- famd.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
# Helper: horizontal bar chart of variable loadings for one dimension
# (removes the duplicated p.d1 / p.d2 plotting code)
famd.imp.plot <- function(res, dim, fill) {
  ggplot(res, aes(x = reorder(var, .data[[dim]]), y = .data[[dim]])) +
    geom_bar(stat = "identity", fill=fill) +
    coord_flip() + # Makes it a horizontal bar chart
    labs(title = paste("Variable importance for", dim),
         y = "Importance", x = "Variable") +
    theme_minimal()
}
p.d1 <- famd.imp.plot(res, "Dim.1", "darkblue")
p.d2 <- famd.imp.plot(res, "Dim.2", "darkred")
ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)
# Set up output directory (recursive = TRUE also creates "artifacts"
# if it is missing, replacing the two separate existence checks)
output.dir <- "artifacts/masks_5k"
if (!dir.exists(output.dir)) dir.create(output.dir, recursive = TRUE)
# LOAD DATA
# Get raster data
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states
# Get observation data (one sf row per observation point)
obs.df <- list.files("data/final", full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)
# BUFFERING RASTER DATA
dist <- 5e3
# Rasterize a `dist`-unit buffer around observation i onto mask.raster.
# update=T merges with buffers already burned in, so repeated calls
# accumulate coverage.
mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
                        dist=10000, u="m") {
  # Bug fix: use the obs.field argument (the original hard-coded the
  # "observation.point" column, silently ignoring the parameter)
  obs.pt <- st_transform(obs.df[i, obs.field], st_crs(mask.raster))
  # Create a buffer around the point, ensuring correct units
  buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
  return(terra::rasterize(buf, mask.raster, update=T, field=1))
}
# For each observation point, you can now create a distance
# raster and then mask cells within the buffer distance.
# Returns a raster matching `r` where cells within `dist` of any
# observation have been rasterized to 1.
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
                               dist=10000, u="m") {
  # Create an empty raster with the same extent and resolution as r
  mask.raster <- terra::rast(r)
  # Iteratively update mask.raster with additional buffered regions.
  # seq_len() is safe for zero-row inputs (1:nrow(obs.df) would
  # produce c(1, 0) and loop anyway).
  for (i in seq_len(nrow(obs.df))) {
    mask.raster <- mask.update(i, mask.raster, obs.df,
                               obs.field=obs.field,
                               dist=dist, u=u)
    gc()  # buffers/rasters can be large; reclaim memory each pass
  }
  return(mask.raster)
}
# Get masks by state, species.
# Builds a nested list: masks[[state]][[species]] -> buffered-zone
# raster. Each mask is cached to disk under output.dir, so re-runs
# read rather than recompute.
masks <- purrr::map(states, function(state) {
  specs <- sort(unique(obs.df$common.name))
  # `st=state` pins the outer state via a default argument
  spec.masks <- purrr::map(specs, function(spec, st=state) {
    fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
    if (file.exists(fname)) {
      cat("Reading", spec, st, "mask from", fname, "\n")
      r.mask <- rast(fname)
    } else {
      cat("Computing", spec, st, "mask, and saving to", fname, "\n")
      # Buffer every observation for this state/species by `dist`
      r.mask <- get.buffered.zones(r=r.list[[st]],
                                   obs.df=filter(obs.df, state == st &
                                                   common.name == spec),
                                   dist=dist)
      terra::writeRaster(r.mask, fname, overwrite=T)
    }
    gc()  # rasters are large; reclaim memory between species
    r.mask
  }, .progress=T)
  names(spec.masks) <- specs
  spec.masks
})
names(masks) <- states
# Function to sample n points from the non-masked parts.
# r.original: the state's raster stack (defines valid in-state cells)
# r.mask:     buffered observation mask (cells near observations)
# n:          requested sample size (raised to min.n, capped at the
#             number of available cells)
# Returns an sf frame of pseudo-absence points with common.name, state,
# lon, lat, and observations = 0. The inverse-mask raster is cached at
# output.path so later runs skip the raster algebra.
sample.inverse.mask <- function(r.original, r.mask, n,
                                species, state,
                                sample.crs=4326, min.n=500,
                                output.dir="artifacts/pseudo_absence_regions") {
  if (!dir.exists(output.dir)) dir.create(output.dir)
  # e.g. "artifacts/pseudo_absence_regions/CO_ruddy_duck.tif"
  output.path <- file.path(output.dir,
                           gsub(" |\\-", "_",
                                paste0(
                                  paste(state, tolower(species), sep="_"),
                                  ".tif")
                           ))
  if (!file.exists(output.path)) {
    # Get inverse mask;
    # Set NA cells to 0, keep 0 cells as 0, change other cells to 1
    r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
    # Set 0 to 1 and everything else to NA
    r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
    # Crop so that anything outside of the state is NA
    r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
    # Create a binary raster from r.original where valid values are
    # set to 1 and NA values remain NA
    r.binary <- terra::lapp(r.original[[1]],
                            fun = function(x) ifelse(!is.na(x), 1, NA))
    # Multiply the cropped raster by the binary raster to ensure
    # outside values are set to NA
    r.final <- r.cropped * r.binary
    terra::writeRaster(r.final, output.path, overwrite=T)
  } else {
    r.final <- terra::rast(output.path)
  }
  # Convert the raster to SpatialPoints
  gdf <- terra::as.points(r.final) %>%
    st_as_sf() %>%
    st_transform(crs=sample.crs)
  if (nrow(gdf) > 0) {
    # NOTE(review): assumes the attribute column produced by
    # terra::as.points() is named "layer" — confirm for this raster
    gdf <- gdf %>%
      filter(!is.na(layer)) %>%
      select(geometry)
  } else {
    # No available cells: return the empty sf object unchanged
    return(gdf)
  }
  # Set to min.n size if n < min.n
  if (n < min.n) n <- min.n
  # Make sure there is sufficient available sample points
  if (n > nrow(gdf)) n <- nrow(gdf)
  # Randomly sample n points from the available (non-masked) space
  sample.idx <- sample(1:nrow(gdf), n)
  samples <- gdf %>%
    slice(sample.idx) %>%
    mutate(common.name = species,
           state = state,
           lon = NA,
           lat = NA,
           observations=0)
  # Populate lon and lat values from the point geometries:
  coords <- st_coordinates(samples)
  samples$lon <- coords[, "X"]
  samples$lat <- coords[, "Y"]
  return(samples)
}
# Observation counts per (state, species); the result stays grouped
# by both keys (.groups = "keep")
totals <- obs.df %>%
  as_tibble() %>%
  group_by(state, common.name) %>%
  summarize(N=n(), .groups="keep")
# Create a list of pseudo absence points, by species and state,
# where the sample number `n` >= 500 | `n` == the total observed
# for each respective state and species
pseudo.absence.pts <- list()
for (st in states) {
  r.original <- r.list[[st]]
  r.masks <- masks[[st]]
  pseudo.absence.pts[[st]] <- list()
  for (spec in names(r.masks)) {
    r.mask <- r.masks[[spec]]
    # Match the observed count for this state/species
    n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
    cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
    # `n` is passed by name, so the positional spec/st fall through to
    # the species/state parameters that follow n in the signature
    pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
      r.original, r.mask, spec, st, n=n, sample.crs=4326)
    cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
  }
}
# Extract raster values for each point.
# For each state: stack its pseudo-absence points, reproject to the
# raster CRS, pull cell values for every layer, reproject back, and
# cache the result as one RDS per state.
if (!dir.exists(file.path("data", "final_pseudo_absence"))) {
  dir.create(file.path("data", "final_pseudo_absence"))
}
for (state in states) {
  out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
  if (!file.exists(out.file.all)) {
    r <- r.list[[state]]
    cat(sprintf("Extracting points to values for %s...\n", state))
    # Load observations shapefile (stack all species for this state)
    geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
    rownames(geo.df) <- NULL
    geo.df.crs <- st_crs(geo.df)
    # Define target CRS and update
    target.crs <- st_crs(r)
    cat(sprintf("Updating CRS for %s...\n", state))
    geo.df <- st_transform(geo.df, target.crs)
    # Extract raster values (one column per layer, state suffix removed)
    for (r.name in names(r)) {
      cat("\tExtracting", r.name, "values for", state, "\n")
      x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
      geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
    }
    # Update crs back
    geo.df <- st_transform(geo.df, geo.df.crs)
    # Fix names; Filter NA values.
    # NOTE(review): if_all(names(.)) includes the geometry column —
    # presumably harmless, but confirm; the piped suppressWarnings()
    # wraps the entire filter and may hide unrelated warnings.
    geo.df <- geo.df %>%
      filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
      suppressWarnings()
    saveRDS(geo.df, out.file.all)
    cat("--------------\n")
  }
}
# Get all pseudo-absence data
abs.df <- list.files(file.path("data", "final_pseudo_absence"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)
# There might be some slight differences since there are occasionally
# NA values (those rows are dropped during extraction); compare the
# pseudo-absence and observed counts side by side.
abs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  # Column spelling fixed ("pseudo", was "psuedo")
  summarize(pseudo.absence.N=n(), .groups="keep") %>%
  left_join(totals, by=c("state", "common.name")) %>%
  rename(observed.N = N) %>%
  knitr::kable()
| state | common.name | pseudo.absence.N | observed.N |
|---|---|---|---|
| CO | Belted Kingfisher | 4523 | 4551 |
| CO | Cedar Waxwing | 3431 | 3446 |
| CO | Downy Woodpecker | 7440 | 7511 |
| CO | Ruddy Duck | 1715 | 1726 |
| CO | Sanderling | 497 | 131 |
| CO | Sandhill Crane | 1524 | 1532 |
| CO | Sharp-shinned Hawk | 2241 | 2257 |
| CO | Wild Turkey | 2597 | 2611 |
| NC | Belted Kingfisher | 4042 | 4183 |
| NC | Cedar Waxwing | 4059 | 4195 |
| NC | Downy Woodpecker | 10415 | 10914 |
| NC | Ruddy Duck | 1076 | 1106 |
| NC | Sanderling | 488 | 311 |
| NC | Sandhill Crane | 489 | 118 |
| NC | Sharp-shinned Hawk | 1211 | 1254 |
| NC | Wild Turkey | 2261 | 2372 |
| OR | Belted Kingfisher | 5803 | 5837 |
| OR | Cedar Waxwing | 8405 | 8446 |
| OR | Downy Woodpecker | 8529 | 8576 |
| OR | Ruddy Duck | 1996 | 2010 |
| OR | Sanderling | 496 | 258 |
| OR | Sandhill Crane | 2443 | 2458 |
| OR | Sharp-shinned Hawk | 2696 | 2714 |
| OR | Wild Turkey | 2429 | 2440 |
| VT | Belted Kingfisher | 1956 | 2033 |
| VT | Cedar Waxwing | 1098 | 3938 |
| VT | Downy Woodpecker | 1598 | 4635 |
| VT | Ruddy Duck | 490 | 51 |
| VT | Sanderling | 493 | 39 |
| VT | Sandhill Crane | 492 | 76 |
| VT | Sharp-shinned Hawk | 730 | 748 |
| VT | Wild Turkey | 2090 | 2181 |
This is a common measure of global spatial autocorrelation. A positive Moran’s I suggests clustering, a negative value suggests dispersion, and a value near zero suggests randomness. In other words, this is a randomization approach to testing for spatial autocorrelation: it essentially checks whether the data exhibit a spatial pattern that differs significantly from what would be expected if the values were randomly distributed in space.
Moran I statistic: This is the calculated Moran’s I value for the data, which quantifies the degree of spatial autocorrelation. A positive value indicates positive autocorrelation (similar values are closer together), while a negative value indicates negative autocorrelation (dissimilar values are closer together). A value close to zero indicates a random spatial pattern.
Moran I statistic standard deviate: This value represents the standardized Moran’s I value. The larger the absolute value of this statistic, the stronger the evidence against the null hypothesis (of no spatial autocorrelation). A positive value indicates positive spatial autocorrelation, suggesting clustering of similar values.
P-value: This is the probability of observing a Moran’s I value as extreme as, or more extreme than, the one computed from the data, assuming the null hypothesis of no spatial autocorrelation is true. A very small p-value provides strong evidence to reject the null hypothesis, indicating significant spatial autocorrelation in your data.
Expectation: This is the expected Moran’s I value under the null hypothesis of no spatial autocorrelation. For a large dataset, it’s typically close to zero.
Variance: This is the variance of the Moran’s I statistic under the null hypothesis.
# Get observation data; only these columns are needed for the
# autocorrelation tests below
df <- list.files(file.path("data", "final"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observations, geometry)
states <- sort(unique(df$state))
species <- sort(unique(df$common.name))
# Function to extract the desired results from a moran.test() output.
# Returns a one-row tibble with the test statistic, p-value, and the
# three named estimates.
extract.data <- function(st, spec, results) {
  # tibble() replaces data_frame(), which is deprecated in the tibble
  # package
  tibble(
    state = st,
    species = spec,
    statistic = results$statistic,
    p.value = results$p.value,
    moran.i.statistic = results$estimate['Moran I statistic'],
    expectation = results$estimate['Expectation'],
    variance = results$estimate['Variance']
  )
}
# Define the k for knn computation (each point's 5 nearest neighbors)
k <- 5
# Perform test by state, by species.
# For each subset: build a k-nearest-neighbour graph, convert it to a
# spatial weights list, and run Moran's I on the observation counts.
# Intermediates (data, knn, nb, listw) are kept for later plotting.
mi.results <- list()
for (st in states) {
  mi.results[[st]] <- list()
  for (spec in species) {
    cat("Doing Moran's test for", spec, "in", st, "\n")
    # Filter data
    d <- df %>% filter(common.name == spec & state == st)
    mi.results[[st]][[spec]]$data <- d
    # Fit knn model
    knn <- d %>%
      knearneigh(k = k)
    mi.results[[st]][[spec]]$knn <- knn
    # Create a neighbor's list
    nb <- knn %>%
      knn2nb()
    mi.results[[st]][[spec]]$nb <- nb
    # Create a spatial weights matrix (spdep default weighting style)
    listw <- nb2listw(nb)
    mi.results[[st]][[spec]]$listw <- listw
    # Compute Moran's I on the raw observation counts
    results <- moran.test(d$observations, listw)
    mi.results[[st]][[spec]]$moran.test.results <- extract.data(st, spec, results)
  }
}
spec.stat <- expand.grid(species=species, state=states, stringsAsFactors=F)
# Collect the per-(species, state) Moran test rows into one tibble.
# map_df() binds rows directly, replacing map() + do.call("rbind", .);
# seq_len() is the safe row-index sequence.
mi.results.df <- purrr::map_df(seq_len(nrow(spec.stat)), function(i) {
  spec <- spec.stat[i, ]$species
  st <- spec.stat[i, ]$state
  mi.results[[st]][[spec]]$moran.test.results
}) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
mi.results.df %>%
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)
The Moran’s I statistic for the Belted Kingfisher and Cedar Waxwing are positive in Oregon, as well as the Sharp-shinned Hawk in both Vermont and Oregon. This suggests that locations with high observations of these species are close to other locations with high observations, and the same for low observations. In simple terms, the observation patterns of these species are clustered.
Below is an example of the actual observations of the Sharp-shinned Hawk in Vermont against the spatial lag (i.e., the weighted sum of the neighboring values at each point). Consider the following when interpreting the plot:
# Get spatial weights list / data for the Sharp-shinned Hawk in VT
vt.ssh <- mi.results[["VT"]][["Sharp-shinned Hawk"]]$data
# Calculate the spatial lag of the observations
# (the weighted sum of each point's neighbors' observation counts)
vt.ssh$spatial.lag <- lag.listw(
  mi.results[["VT"]][["Sharp-shinned Hawk"]]$listw,
  vt.ssh$observations)
# Scatter plot for the Sharp-Shinned Hawk in Vermont:
# observations vs. their spatial lag on log10 axes, with a linear fit
ggplot(vt.ssh, aes(x=observations, y=spatial.lag)) +
  geom_point() +
  geom_smooth(method="lm", col="red") + # Adds a linear regression line
  ggtitle("Moran Scatter Plot for Sharp-shinned Hawk in VT") +
  xlab("Observations (log10 Scale)") +
  ylab("Spatial Lag of Observations (log10 Scale)") +
  scale_y_log10() + scale_x_log10()
# Build one sf data frame of raster cell values per state, with the
# land-cover factor expanded into binary (one-hot) columns.
env.df.list <- list()
for (state in states) {
  r <- r.list[[state]]
  # Every raster cell as a point, reprojected to WGS84
  gdf <- terra::as.points(r) %>%
    st_as_sf() %>%
    st_transform(crs=4326)
  # Fix names (strip the trailing state suffix, e.g. "_CO")
  names(gdf) <- gsub(paste0("_", state, "$"), "", names(gdf))
  # Convert land cover to binary variables
  binary.lc <- caret::dummyVars(~NLCD_Land, data=gdf, sep=".") %>%
    predict(., gdf) %>%
    as.data.frame()
  # Normalize dummy column names to snake_case
  names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>%
    gsub("\\/| |,", "_", .) %>%
    gsub("__", "_", .) %>%
    tolower()
  # Replace the factor column with its binary expansion
  gdf <- gdf %>% select(-NLCD_Land) %>% cbind(binary.lc)
  env.df.list[[state]] <- gdf
}
# Perform test by state, by environmental variable.
# Results are cached to per-variable RDS files; constant variables
# (one distinct value) are skipped since Moran's I is undefined there.
env.mi.results <- list()
output.dir <- file.path("artifacts", "env_autocor_morans")
if (!dir.exists(output.dir)) dir.create(output.dir)
for (st in states) {
  env.mi.results[[st]] <- list()
  gdf <- env.df.list[[st]]
  env.vars <- names(gdf)[names(gdf) != "geometry"]
  for (e in env.vars) {
    output.path <- file.path(output.dir, paste0(st, "_", e, ".rds"))
    if (!file.exists(output.path)) {
      cat("Doing Moran's test for", e, "in", st, "\n")
      # Filter data (sf frame keeps the geometry for knearneigh)
      d <- gdf %>% select(!!sym(e))
      # env.mi.results[[st]][[e]]$data <- d
      n.distinct <- d %>% pull(!!sym(e)) %>% unique() %>% length()
      if (n.distinct > 1) {
        # Fit knn model
        knn <- d %>%
          knearneigh(k = k)
        # env.mi.results[[st]][[e]]$knn <- knn
        # Create a neighbor's list
        nb <- knn %>%
          knn2nb()
        # env.mi.results[[st]][[e]]$nb <- nb
        # Create a spatial weights matrix
        listw <- nb2listw(nb)
        # env.mi.results[[st]][[e]]$listw <- listw
        # Compute Moran's I
        results <- moran.test(d[[e]], listw)
        env.mi.results[[st]][[e]]$moran.test.results <- extract.data(st, e, results)
      }
      cat("\tSaving results...\n")
      # NOTE(review): for skipped (constant) variables this saves NULL;
      # downstream row-binding drops NULLs — confirm this is intended
      saveRDS(env.mi.results[[st]][[e]]$moran.test.results, output.path)
    } else {
      cat("Getting Moran's test results for", e, "in", st, "\n")
      env.mi.results[[st]][[e]]$moran.test.results <- readRDS(output.path)
    }
  }
}
# Every (environmental variable, state) combination to report on
env.stat <- expand.grid(env=names(env.df.list$CO)[names(env.df.list$CO) != "geometry"],
                        state=states, stringsAsFactors=F)
# Collect the per-(variable, state) Moran test rows into one tibble.
# map_df() binds rows directly and drops NULL entries (constant
# layers), replacing map() + do.call("rbind", .).
env.mi.results.df <- purrr::map_df(seq_len(nrow(env.stat)), function(i) {
  e <- env.stat[i, ]$env
  st <- env.stat[i, ]$state
  env.mi.results[[st]][[e]]$moran.test.results
}) %>%
  as_tibble() %>%
  # Compute adjusted p-value, accounting for multiple comparisons
  mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
env.mi.results.df %>%
  filter(adj.p.value <= 0.05) %>%
  arrange(-moran.i.statistic)
The weather layers in particular have Moran’s I values close to 1, which is the maximum possible value. This indicates a very strong spatial clustering of these variables.
# Align every state's columns (filling missing ones with NA), stack
# all states, then drop geometry and the "unclassified" column.
all.columns <- unique(unlist(lapply(env.df.list, names)))
env.df <- lapply(env.df.list, function(df) {
  missing.cols <- setdiff(all.columns, names(df))
  for (col in missing.cols) {
    df[[col]] <- NA
  }
  return(df)
}) %>%
  do.call("rbind", .) %>%
  as.data.frame() %>%
  select(-geometry, -unclassified)
# Correlation matrix (complete cases only), and condition indices
# from the eigenvalues of that matrix
corr.matrix <- cor(env.df, use="na.or.complete")
eigenvalues <- eigen(corr.matrix)$values
# NOTE(review): condition indices belong to eigen-dimensions, not to
# individual variables; pairing them positionally with names(env.df)
# implies a per-variable mapping that eigen() does not provide —
# confirm this labeling is intended.
ci.df <- tibble(
  variable=names(env.df),
  condition.index=sqrt(max(eigenvalues) / eigenvalues)
)
ci.df
# Extract the eigenvectors
eigenvectors <- eigen(corr.matrix)$vectors
# Dimensions with condition index above this rule-of-thumb threshold
# indicate serious multicollinearity
threshold <- 30
large.ci.indices <- which(sqrt(max(eigenvalues) / eigenvalues) > threshold)
# Examine the eigenvectors for each problematic dimension
for (index in large.ci.indices) {
  cat(paste("Eigenvalue:", eigenvalues[index]), "\n")
  cat(paste("Condition Index:",
            sqrt(max(eigenvalues) / eigenvalues[index])), "\n")
  # Sorting eigenvector components by absolute magnitude for clarity
  ev <- eigenvectors[, index]
  sorted.ev <- sort(abs(ev), decreasing = T)
  # tibble(value = ...) replaces the deprecated as_tibble() call on a
  # bare numeric vector
  ev.top <- tibble(value = sorted.ev[1:5]) %>%
    mutate(variable=rownames(corr.matrix)[order(-abs(ev))][1:5]) %>%
    select(variable, value)
  cat("\nTop 5 contributors to multicollinearity at the condition idx:\n")
  for (row in 1:5) {
    cat("\t", ev.top[row,]$variable, ":", signif(ev.top[row,]$value, 3), "\n")
  }
  cat("\n------------------------\n")
}
## Eigenvalue: 0.00135666182336175
## Condition Index: 65.3042479620253
##
## Top 5 contributors to multicollinearity at the condition idx:
## avg_prcp : 0.814
## tmax : 0.459
## tmin : 0.356
## dem : 0.0108
## Winter_NDVI : 0.00657
##
## ------------------------
## Eigenvalue: 1.97064586234132e-15
## Condition Index: 54184234.4382929
##
## Top 5 contributors to multicollinearity at the condition idx:
## shrub_scrub : 0.508
## evergreen_forest : 0.5
## herbaceous : 0.473
## cultivated_crops : 0.318
## deciduous_forest : 0.207
##
## ------------------------
corrplot::corrplot(corr.matrix)
First, create a space to output any “engineered” rasters:
# Directory for the engineered (derived) rasters; created if missing
output.dir <- "artifacts/feature_engineering"
if (!dir.exists(output.dir)) dir.create(output.dir)
Recall the hierarchical categories for the land cover data:
Using these categories, feature reduction can be accomplished using a heuristic methodology. Other redundant rasters, such as the Waterbody and Urban Imperviousness rasters, can also be combined with the respective related land cover rasters to further reduce multicollinearity between rasters.
# Collapse detailed NLCD land-cover classes into parent-category binary
# rasters (one GeoTIFF per category per state), caching to output.dir.
# "water" is averaged with the waterbody layer; "developed" becomes a
# 0.25-1.0 intensity scale averaged with scaled urban imperviousness.
create.parent.category.rasters <- function(input.raster,
                                           state,
                                           output.dir,
                                           layer.name="NLCD_Land") {
  # Define the category mappings (child class labels -> parent name)
  categories <- list(
    water = c("Open Water"),
    ice.snow = c("Perennial Ice/Snow"),
    developed = c("Developed, Open Space",
                  "Developed, Low Intensity",
                  "Developed, Medium Intensity",
                  "Developed, High Intensity"),
    barren = c("Barren Land"),
    forest = c("Mixed Forest",
               "Deciduous Forest",
               "Evergreen Forest"),
    shrubland = c("Shrub/Scrub", "Dwarf Shrub"),
    herbaceous = c("Herbaceous", "Grassland/Herbaceous",
                   "Sedge/Herbaceous", "Lichens", "Moss"),
    planted.cultivated = c("Cultivated Crops", "Hay/Pasture"),
    wetlands = c("Woody Wetlands", "Emergent Herbaceous Wetlands")
  )
  # Iterate through each category to create and save the binary raster
  for (cat.name in names(categories)) {
    # Define the output file path
    output.file <- file.path(output.dir, paste0(state, "_", cat.name, ".tif"))
    if (!file.exists(output.file)) {
      cat("Generating raster for", cat.name, "in", state, "\n")
      if (cat.name != "developed") {
        # Create a binary raster for the current category.
        # NOTE(review): the lapp fun compares cell values `x` against
        # label strings — presumably the NLCD layer is categorical and
        # its values arrive as labels here; confirm against the raster.
        out.raster <- terra::lapp(input.raster[[layer.name]],
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x %in% categories[[cat.name]] ~ 1.,
                                      T ~ 0)
                                  })
        if (cat.name == "water") {
          # Combine (average) with waterbody raster layer
          wb.raster <- input.raster[[paste0("waterbody_", state)]]
          out.raster <- (out.raster + wb.raster) / 2
        }
      } else {
        # Set developed raster to be a value, scale ranging from 0.25-1 by 0.25
        out.raster <- terra::lapp(input.raster[[layer.name]],
                                  fun = function(x) {
                                    case_when(
                                      is.na(x) ~ NA,
                                      x == "Developed, Open Space" ~ 0.25,
                                      x == "Developed, Low Intensity" ~ 0.5,
                                      x == "Developed, Medium Intensity" ~ 0.75,
                                      x == "Developed, High Intensity" ~ 1.,
                                      T ~ 0)
                                  })
        # Combine with urban imperviousness (min-max scaled to [0, 1])
        ui.raster <- input.raster[[paste0("urban_imperviousness_", state)]]
        ui.min <- terra::minmax(ui.raster)[[1]]
        ui.max <- terra::minmax(ui.raster)[[2]]
        ui.raster.scaled <- (ui.raster - ui.min) / (ui.max - ui.min)
        out.raster <- (out.raster + ui.raster.scaled) / 2
      }
      # Save the output raster to the specified output path
      writeRaster(out.raster, output.file, overwrite=T)
    }
  }
}
# Generate the parent-category rasters for every state
for (state in states) {
  create.parent.category.rasters(r.list[[state]], state, output.dir)
}
# Convert 4 separate NDVI rasters to a single raster:
# each seasonal layer is min-max scaled to [0, 1], then the cell-wise
# max and min across seasons are written out.
for (state in states) {
  output.file.min <- file.path(output.dir, paste0(state, "_min_NDVI.tif"))
  output.file.max <- file.path(output.dir, paste0(state, "_max_NDVI.tif"))
  # `||` (scalar or) rather than the vectorized `|` in a scalar if()
  if (!file.exists(output.file.min) || !file.exists(output.file.max)) {
    r.ndvi <- r.list[[state]][[names(r.list[[state]]) %like% "NDVI"]]
    # Min-max scale each seasonal layer to [0, 1]
    for (s in names(r.ndvi)) {
      r <- r.ndvi[[s]]
      rng <- terra::minmax(r)  # single call instead of two
      r.ndvi[[s]] <- (r - rng[[1]]) / (rng[[2]] - rng[[1]])
    }
    # Get min/max scaled NDVI values across seasons
    r.max <- max(r.ndvi)
    r.min <- min(r.ndvi)
    writeRaster(r.max, output.file.max, overwrite=T)
    writeRaster(r.min, output.file.min, overwrite=T)
  }
}
# Get the difference between the min and max temperatures as a raster
for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_tdiff.tif"))
  if (!file.exists(output.file)) {
    # Temperature range: tmax - tmin
    r.tdiff <- r.list[[state]][[paste0("tmax_", state)]] -
      r.list[[state]][[paste0("tmin_", state)]]
    # Min-max scale the difference to [0, 1]
    rng.min <- terra::minmax(r.tdiff)[[1]]
    rng.max <- terra::minmax(r.tdiff)[[2]]
    r.tdiff <- (r.tdiff - rng.min) / (rng.max - rng.min)
    # Write raster
    writeRaster(r.tdiff, output.file, overwrite=T)
  }
}
These are components derived from the spatial coordinates, which can capture and account for spatial structures in the data. E.g., a polynomial term based on latitude and longitude could account for some of the spatial trend.
# Spatial-trend surrogate rasters: lon, lat, their squares, and the
# lon*lat interaction, derived from each state's raster grid.
for (state in states) {
  output.file.lon <- file.path(output.dir, paste0(state, "_lon.tif"))
  output.file.lat <- file.path(output.dir, paste0(state, "_lat.tif"))
  output.file.lon2 <- file.path(output.dir, paste0(state, "_lon_poly.tif"))
  output.file.lat2 <- file.path(output.dir, paste0(state, "_lat_poly.tif"))
  output.file.lli <- file.path(output.dir, paste0(state, "_lon_lat_interaction.tif"))
  # Skip this state only when all five outputs already exist
  if (!all(file.exists(c(output.file.lon, output.file.lat,
                         output.file.lon2, output.file.lat2,
                         output.file.lli)))) {
    # Get raster as df: cell numbers plus WGS84 lon/lat per cell
    r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>%
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y")
    # Initialize empty raster templates matching the source grid
    r.lat <- ext(r.list[[state]]) %>%
      rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    r.lon <- ext(r.list[[state]]) %>%
      rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with lat/lon values, addressing cells by cell number
    r.lat[r.df$cell] <- r.df$lat
    names(r.lat) <- "lat"
    r.lon[r.df$cell] <- r.df$lon
    names(r.lon) <- "lon"
    # Get polynomial and interaction terms
    r.lat2 <- r.lat^2
    names(r.lat2) <- "lat.sqrd"
    r.lon2 <- r.lon^2
    names(r.lon2) <- "lon.sqrd"
    r.lon.lat <- r.lon * r.lat
    names(r.lon.lat) <- "lon.lat.interaction"
    # Write outputs
    writeRaster(r.lat, output.file.lat, overwrite=T)
    writeRaster(r.lon, output.file.lon, overwrite=T)
    writeRaster(r.lat2, output.file.lat2, overwrite=T)
    writeRaster(r.lon2, output.file.lon2, overwrite=T)
    writeRaster(r.lon.lat, output.file.lli, overwrite=T)
  }
}
# De-trended DEM: regress elevation on a quadratic surface in lon/lat
# and keep the residuals, removing the broad spatial trend from the
# elevation layer.
for (state in states) {
  output.file <- file.path(output.dir, paste0(state, "_detrend_dem.tif"))
  if (!file.exists(output.file)) {
    # Get raster as df (cell number, WGS84 lon/lat, dem value)
    r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>%
      rename(lon=x, lat=y) %>%
      select(lon, lat, cell, !!sym(paste0("dem_", state))) %>%
      st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
      st_transform(crs=4326) %>%
      cbind(st_coordinates(.)) %>%
      rename(lon="X", lat="Y", dem=paste0("dem_", state)) %>%
      # Convert to data.table
      setDT()
    # Fit model based on location, with dem as response
    fit <- lm(dem ~ lat * lon + I(lat^2) + I(lon^2),
              data=r.df[!is.na(dem)])
    # Get residuals from the model as "de-trended" dem values
    # (assigned only to rows with non-NA dem, matching the fit data)
    r.df[!is.na(dem), dem.detrended := residuals(fit)]
    # Min-max scale de-trended values to [0, 1]
    r.df[, dem.detrended := (dem.detrended - min(dem.detrended, na.rm=T)) /
           (max(dem.detrended, na.rm=T) - min(dem.detrended, na.rm=T))]
    # Initialize empty raster template
    r.dem <- ext(r.list[[state]]) %>%
      rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
    # Fill with de-trended dem values, addressing cells by cell number
    r.dem[r.df$cell] <- r.df$dem.detrended
    names(r.dem) <- "dem.detrended"
    # Write outputs
    writeRaster(r.dem, output.file, overwrite=T)
  }
}
# Combine observation and pseudo-absence data into a single frame
df <- purrr::map_df(
  c(list.files(file.path("data", "final_pseudo_absence"), full.names = T),
    list.files(file.path("data", "final"), full.names = T)),
  readRDS)
# Row indices for a stratified train split: points are binned on a
# lat/lon grid and stratified by species, state, and presence/absence
# so the split is spatially balanced.
# p:            training proportion
# lat.lon.bins: number of divisions along each coordinate axis
stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
  # Grid-cell bin along each axis
  df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
  df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
  # Pseudo-absence rows are those with zero observations
  df$absence <- df$observations == 0
  # One stratum per grid cell / species / state / absence combination
  strata <- paste(df$lat.bin, df$lon.bin, df$common.name, df$state,
                  df$absence)
  # createDataPartition warns for tiny strata; suppress as before
  suppressWarnings(createDataPartition(strata, p = p, list = F))
}
# Split df into train/test subsets via the stratified index.
# Returns a list with the two subsets and the index itself.
prepare.data <- function(df, p=.7, lat.lon.bins=25) {
  idx <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
  list(train = df[idx, ],
       test = df[-idx, ],
       index = idx)
}
# Build the 70/30 split and materialize train/test from the index
train.test <- prepare.data(df, .7)
train <- df[train.test$index, ]
test <- df[-train.test$index, ]